Quarto Basics

First we load in the libraries and set defaults

import pandas as pd
pd.options.mode.copy_on_write = True
import numpy as np
from itables import init_notebook_mode
init_notebook_mode(all_interactive=False)
from itables import show
from PIL import Image
from IPython.display import display

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

pd.options.display.max_columns = None
This is the init_notebook_mode cell from ITables v2.1.3
(you should not see this message - is your notebook trusted?)

Now we read in the data files

years=list(range(1991, 2017))

years=[str(x) + ".txt" for x in years]
years=["Elements " + s for s in years]
years = ["C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/" + s for s in years]
years
['C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1991.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1992.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1993.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1994.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1995.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1996.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1997.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1998.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1999.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2000.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2001.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2002.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2003.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2004.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2005.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2006.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2007.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2008.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2009.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2010.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2011.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2012.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2013.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2014.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2015.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2016.txt']
files = years + ["C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2017 MB 1-3.txt", "C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2017 MB 4.txt"]
files
['C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1991.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1992.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1993.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1994.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1995.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1996.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1997.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1998.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 1999.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2000.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2001.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2002.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2003.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2004.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2005.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2006.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2007.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2008.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2009.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2010.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2011.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2012.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2013.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2014.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2015.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2016.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2017 MB 1-3.txt',
 'C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/Elements 2017 MB 4.txt']
data_read = []

for file in files:
  data_read.append(pd.read_table(file, encoding = 'latin-1'))

data_read.append(pd.read_csv("C:/Users/carte/OneDrive/Documents/FDA/TDS/elements/2018_2020.csv"))
df = pd.concat(data_read)
df.shape
(429946, 37)
df.dtypes
MB                              float64
Food No.                        float64
Food Name                        object
Anal Type                        object
Sample Qualifier                 object
Replicate No.                    object
Element                          object
Conc                            float64
Unit                             object
Trace                            object
LOD                             float64
LOQ                             float64
Result Qualifier and Remarks     object
Method                           object
Instrument                       object
Batch ID                         object
METHOD                           object
Food No                         float64
ReplicateNo                      object
Reference Material               object
QC Level                         object
QC Unit                          object
QC% Recvd                        object
City ID                          object
Replicate No                    float64
FiscalYear                      float64
CalendarYear                    float64
Collection                      float64
TDSFoodNumber                   float64
TDSFoodDescription               object
Season                           object
TDSFoodListType                  object
Region                           object
Analyte                          object
Units                            object
Concentration                   float64
ReportingLimit                  float64
dtype: object

Exploring the data and doing some clean up

df.columns = df.columns.str.replace(" ", ".")
df.columns
Index(['MB', 'Food.No.', 'Food.Name', 'Anal.Type', 'Sample.Qualifier',
       'Replicate.No.', 'Element', 'Conc', 'Unit', 'Trace', 'LOD', 'LOQ',
       'Result.Qualifier.and.Remarks', 'Method', 'Instrument', 'Batch.ID',
       'METHOD', 'Food.No', 'ReplicateNo', 'Reference.Material', 'QC.Level',
       'QC.Unit', 'QC%.Recvd', 'City.ID', 'Replicate.No', 'FiscalYear',
       'CalendarYear', 'Collection', 'TDSFoodNumber', 'TDSFoodDescription',
       'Season', 'TDSFoodListType', 'Region', 'Analyte', 'Units',
       'Concentration', 'ReportingLimit'],
      dtype='object')
df['Food.Name']=df['Food.Name'].str.replace('w/', 'with').str.upper()
#df['Food.Name'].unique()
df['Unit']=df['Unit'].str.replace(' ', '')
df['Unit'].unique()
array(['mg/kg', '%', 'ug/kg', nan], dtype=object)

Trying to understand the dataset. What is missing?

df.isnull().sum()
MB                               58079
Food.No.                         90743
Food.Name                        58079
Anal.Type                        58079
Sample.Qualifier                307204
Replicate.No.                   354905
Element                          58079
Conc                             58079
Unit                             58079
Trace                           370168
LOD                              59909
LOQ                              59909
Result.Qualifier.and.Remarks    319556
Method                          137807
Instrument                       58081
Batch.ID                        173749
METHOD                          350220
Food.No                         397282
ReplicateNo                     415226
Reference.Material              415226
QC.Level                        415226
QC.Unit                         415226
QC%.Recvd                       415226
City.ID                         429833
Replicate.No                    429469
FiscalYear                      371867
CalendarYear                    371867
Collection                      371867
TDSFoodNumber                   371867
TDSFoodDescription              371867
Season                          371867
TDSFoodListType                 371867
Region                          371868
Analyte                         371868
Units                           371868
Concentration                   371868
ReportingLimit                  371868
dtype: int64

What are the null entries? Trying to figure out if the difference in null units between Unit and Units

(pd.isnull(df['Units']) & pd.isnull(df['Unit'])).sum()
1

So there are is one row where Unit and Units are empty

df = df[~(pd.isnull(df['Units']) & pd.isnull(df['Unit']))].copy()
df.shape
(429945, 37)
pd.isnull(df['Units']).sum()
371867
pd.isnull(df['Unit']).sum()
58078
pd.isnull(df['Units']).sum() + pd.isnull(df['Unit']).sum()
429945
371867+58078
429945

So we are going to simply substitute for Unit and drop [‘Units’]

df.loc[df['Unit'].isnull()]['Unit'] = df.loc[df['Unit'].isnull()]['Units']
C:\Users\carte\AppData\Local\Temp\ipykernel_12728\2905289810.py:1: ChainedAssignmentError:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment.
When using the Copy-on-Write mode, such chained assignment never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy.

Try using '.loc[row_indexer, col_indexer] = value' instead, to perform the assignment in a single step.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df = df.drop(['Units'], axis = 1)
(pd.isnull(df['Conc']) & pd.isnull(df['Concentration'])).sum()
0
df['Conc'].isnull().sum()
58078
df['Concentration'].isnull().sum()
371867
371867 + 58078
429945

So just like above we need to substitute for Conc

df.loc[df['Conc'].isnull()]['Conc'] = df.loc[df['Conc'].isnull()]['Concentration']
C:\Users\carte\AppData\Local\Temp\ipykernel_12728\3239790615.py:1: ChainedAssignmentError:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment.
When using the Copy-on-Write mode, such chained assignment never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy.

Try using '.loc[row_indexer, col_indexer] = value' instead, to perform the assignment in a single step.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df = df.drop(['Concentration'], axis = 1)
(pd.isnull(df['Element']) & pd.isnull(df['Analyte'])).sum()
0
df['Element'].isnull().sum()
58078
df['Analyte'].isnull().sum()
371867
58078 + 371867
429945
df.loc[df['Element'].isnull()]['Element'] = df.loc[df['Element'].isnull()]['Analyte']
C:\Users\carte\AppData\Local\Temp\ipykernel_12728\695756292.py:1: ChainedAssignmentError:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment.
When using the Copy-on-Write mode, such chained assignment never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy.

Try using '.loc[row_indexer, col_indexer] = value' instead, to perform the assignment in a single step.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df = df.drop(['Analyte'], axis = 1)
df.columns
Index(['MB', 'Food.No.', 'Food.Name', 'Anal.Type', 'Sample.Qualifier',
       'Replicate.No.', 'Element', 'Conc', 'Unit', 'Trace', 'LOD', 'LOQ',
       'Result.Qualifier.and.Remarks', 'Method', 'Instrument', 'Batch.ID',
       'METHOD', 'Food.No', 'ReplicateNo', 'Reference.Material', 'QC.Level',
       'QC.Unit', 'QC%.Recvd', 'City.ID', 'Replicate.No', 'FiscalYear',
       'CalendarYear', 'Collection', 'TDSFoodNumber', 'TDSFoodDescription',
       'Season', 'TDSFoodListType', 'Region', 'ReportingLimit'],
      dtype='object')
df.shape
(429945, 34)

We need to also fix the Food No issue. There are two columns named Food No that we need to edit similar to what we did above. However, this time we need to index with integers

df.iloc[:, [1, 17]]
Food.No. Food.No
0 1.0 NaN
1 1.0 NaN
2 1.0 NaN
3 1.0 NaN
4 1.0 NaN
... ... ...
58073 NaN NaN
58074 NaN NaN
58075 NaN NaN
58076 NaN NaN
58077 NaN NaN

429945 rows × 2 columns

df.iloc[:, 1].isnull().sum() + df.iloc[:, 17].isnull().sum()
488023
df.iloc[:, 18].isnull().sum()
415225
df.loc[df.iloc[:, 1].isnull()].iloc[:, 1] = df.loc[df.iloc[:, 1].isnull()].iloc[:, 17]
C:\Users\carte\AppData\Local\Temp\ipykernel_12728\841977522.py:1: ChainedAssignmentError:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment.
When using the Copy-on-Write mode, such chained assignment never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy.

Try using '.loc[row_indexer, col_indexer] = value' instead, to perform the assignment in a single step.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
#df = df.drop(df.columns[17], axis=1)
df.isnull().sum()
MB                               58078
Food.No.                         90742
Food.Name                        58078
Anal.Type                        58078
Sample.Qualifier                307203
Replicate.No.                   354904
Element                          58078
Conc                             58078
Unit                             58078
Trace                           370167
LOD                              59908
LOQ                              59908
Result.Qualifier.and.Remarks    319555
Method                          137806
Instrument                       58080
Batch.ID                        173748
METHOD                          350219
ReplicateNo                     415225
Reference.Material              415225
QC.Level                        415225
QC.Unit                         415225
QC%.Recvd                       415225
City.ID                         429832
Replicate.No                    429468
FiscalYear                      371867
CalendarYear                    371867
Collection                      371867
TDSFoodNumber                   371867
TDSFoodDescription              371867
Season                          371867
TDSFoodListType                 371867
Region                          371867
ReportingLimit                  371867
dtype: int64
df.columns
Index(['MB', 'Food.No.', 'Food.Name', 'Anal.Type', 'Sample.Qualifier',
       'Replicate.No.', 'Element', 'Conc', 'Unit', 'Trace', 'LOD', 'LOQ',
       'Result.Qualifier.and.Remarks', 'Method', 'Instrument', 'Batch.ID',
       'METHOD', 'ReplicateNo', 'Reference.Material', 'QC.Level', 'QC.Unit',
       'QC%.Recvd', 'City.ID', 'Replicate.No', 'FiscalYear', 'CalendarYear',
       'Collection', 'TDSFoodNumber', 'TDSFoodDescription', 'Season',
       'TDSFoodListType', 'Region', 'ReportingLimit'],
      dtype='object')
df['Conc_mgkg'] = np.where(df['Unit']=="ug/kg", df['Conc']/1000, df['Conc'])
df['Conc_mgkg'] = np.where(df['Unit']=="ppb", df['Conc']/1000, df['Conc'])
df = df.drop(['Conc'], axis = 1)
df['Element']=df.Element.str.upper()
df['Element'].unique()
array(['ARSENIC', 'CADMIUM', 'CALCIUM', 'COPPER', 'IRON', 'LEAD',
       'MAGNESIUM', 'MERCURY', 'PHOSPHORUS', 'POTASSIUM', 'SELENIUM',
       'SODIUM', 'ZINC', 'MANGANESE', 'NICKEL', 'IODINE', 'ANTIMONY',
       'ALUMINUM', 'MOLYBDENUM', 'WATER (ANALYSIS FOR TOTAL DIET ONLY)',
       'BARIUM', 'BERYLLIUM', 'CHROMIUM', 'DIMETHYLARSINIC ACID ',
       'MONOMETHYLARSONIC ACID ', 'SILVER', 'THALLIUM',
       'TOTAL INORGANIC ARSENIC ', 'URANIUM', 'DIMETHYLARSINIC ACID',
       'MONOMETHYLARSONIC ACID', 'STRONTIUM', 'TOTAL INORGANIC ARSENIC',
       'VANADIUM', nan], dtype=object)
df_percent = df.query('Unit == "%"').copy()

Looks like all the % entries are water analysis

df=df.query('Unit != "%"').copy()
df=df.rename(columns={"Food.No.": "Food.No",
                   "Replicate.No.": "Replicate.No"})
df.Element.unique()
array(['ARSENIC', 'CADMIUM', 'CALCIUM', 'COPPER', 'IRON', 'LEAD',
       'MAGNESIUM', 'MERCURY', 'PHOSPHORUS', 'POTASSIUM', 'SELENIUM',
       'SODIUM', 'ZINC', 'MANGANESE', 'NICKEL', 'IODINE', 'ANTIMONY',
       'ALUMINUM', 'MOLYBDENUM', 'BARIUM', 'BERYLLIUM', 'CHROMIUM',
       'DIMETHYLARSINIC ACID ', 'MONOMETHYLARSONIC ACID ', 'SILVER',
       'THALLIUM', 'TOTAL INORGANIC ARSENIC ', 'URANIUM',
       'DIMETHYLARSINIC ACID', 'MONOMETHYLARSONIC ACID', 'STRONTIUM',
       'TOTAL INORGANIC ARSENIC', 'VANADIUM', nan], dtype=object)
elements = ['ARSENIC', 'CADMIUM', 'CALCIUM', 'COPPER', 'IRON', 'LEAD',
       'MAGNESIUM', 'PHOSPHORUS', 'POTASSIUM', 'SELENIUM',
       'SODIUM', 'ZINC', 'MANGANESE', 'NICKEL', 'MOLYBDENUM', 'BARIUM', 'CHROMIUM',

       'THALLIUM',
       'STRONTIUM',
       ]
elements
['ARSENIC',
 'CADMIUM',
 'CALCIUM',
 'COPPER',
 'IRON',
 'LEAD',
 'MAGNESIUM',
 'PHOSPHORUS',
 'POTASSIUM',
 'SELENIUM',
 'SODIUM',
 'ZINC',
 'MANGANESE',
 'NICKEL',
 'MOLYBDENUM',
 'BARIUM',
 'CHROMIUM',
 'THALLIUM',
 'STRONTIUM']
df = df[df['Element'].isin(elements)].copy()
df.columns
Index(['MB', 'Food.No', 'Food.Name', 'Anal.Type', 'Sample.Qualifier',
       'Replicate.No', 'Element', 'Unit', 'Trace', 'LOD', 'LOQ',
       'Result.Qualifier.and.Remarks', 'Method', 'Instrument', 'Batch.ID',
       'METHOD', 'ReplicateNo', 'Reference.Material', 'QC.Level', 'QC.Unit',
       'QC%.Recvd', 'City.ID', 'Replicate.No', 'FiscalYear', 'CalendarYear',
       'Collection', 'TDSFoodNumber', 'TDSFoodDescription', 'Season',
       'TDSFoodListType', 'Region', 'ReportingLimit', 'Conc_mgkg'],
      dtype='object')

To understand what the columns are referring to see https://www.fda.gov/food/fda-total-diet-study-tds/fda-total-diet-study-tds-results and https://www.fda.gov/media/93359/download?attachment

img = Image.open('C:/Users/carte/OneDrive/Documents/GitHub/FDA-TDS/elements_column_key.PNG')
display(img)

Some summarizing

df.groupby('Element').count()
MB Food.No Food.Name Anal.Type Sample.Qualifier Replicate.No Unit Trace LOD LOQ Result.Qualifier.and.Remarks Method Instrument Batch.ID METHOD ReplicateNo Reference.Material QC.Level QC.Unit QC%.Recvd City.ID Replicate.No FiscalYear CalendarYear Collection TDSFoodNumber TDSFoodDescription Season TDSFoodListType Region ReportingLimit Conc_mgkg
Element
ARSENIC 25228 23336 25228 25228 7309 4441 25228 3138 25220 25220 7142 20069 25228 16917 5159 1008 1008 1008 1008 1008 0 7 0 0 0 0 0 0 0 0 0 25228
BARIUM 15 11 15 15 15 11 15 0 15 15 14 15 15 11 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 15
CADMIUM 25501 23609 25501 25501 8010 5140 25501 5397 25501 25501 7933 20342 25501 17375 5159 1008 1008 1008 1008 1008 0 8 0 0 0 0 0 0 0 0 0 25501
CALCIUM 24411 22519 24411 24411 7578 4748 24411 384 24411 24411 5944 19252 24411 16087 5159 1008 1008 1008 1008 1008 0 29 0 0 0 0 0 0 0 0 0 24411
CHROMIUM 3807 2923 3807 3807 3722 1866 3807 171 3807 3807 3720 3807 3807 2747 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 3807
COPPER 24418 22526 24418 24418 7562 4723 24418 11948 24418 24418 5954 19259 24418 16095 5159 1008 1008 1008 1008 1008 0 24 0 0 0 0 0 0 0 0 0 24418
IRON 24427 22535 24427 24427 7612 4773 24427 6626 24427 24427 5953 19267 24426 16095 5159 1008 1008 1008 1008 1008 0 34 0 0 0 0 0 0 0 0 0 24427
LEAD 26414 24458 26414 26414 8166 5227 26414 4791 26414 26414 7725 20833 26414 18282 5581 1072 1072 1072 1072 1072 0 7 0 0 0 0 0 0 0 0 0 26414
MAGNESIUM 24418 22526 24418 24418 7561 4714 24418 583 24418 24418 5942 19259 24418 16088 5159 1008 1008 1008 1008 1008 0 22 0 0 0 0 0 0 0 0 0 24418
MANGANESE 25225 23332 25225 25225 8363 5497 25225 4502 25225 25225 6751 20066 25225 16893 5159 1008 1008 1008 1008 1008 0 9 0 0 0 0 0 0 0 0 0 25225
MOLYBDENUM 5832 4951 5832 5832 4760 2909 5832 412 4817 4817 5742 4807 5831 4752 1024 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 5832
NICKEL 18665 16772 18665 18665 4861 3044 18665 5670 18664 18664 5438 13505 18665 15513 5160 1008 1008 1008 1008 1008 2 16 0 0 0 0 0 0 0 0 0 18665
PHOSPHORUS 24413 22525 24413 24413 7544 4702 24413 569 24413 24413 5940 19254 24413 16085 5159 1008 1008 1008 1008 1008 0 20 0 0 0 0 0 0 0 0 0 24413
POTASSIUM 25214 23322 25214 25214 8342 5498 25214 336 25214 25214 6741 20055 25214 16885 5159 1008 1008 1008 1008 1008 0 23 0 0 0 0 0 0 0 0 0 25214
SELENIUM 21325 20440 21325 21325 3568 1714 21325 3931 21325 21325 3621 16166 21325 13089 5159 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 21325
SODIUM 25215 23323 25215 25215 8362 5521 25215 2974 25215 25215 6740 20057 25215 16884 5158 1008 1008 1008 1008 1008 0 23 0 0 0 0 0 0 0 0 0 25215
STRONTIUM 889 5 889 889 787 0 889 281 889 889 782 889 889 859 0 0 0 0 0 0 0 23 0 0 0 0 0 0 0 0 0 889
THALLIUM 15 11 15 15 15 11 15 5 15 15 14 15 15 11 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 15
ZINC 24430 22538 24430 24430 7591 4747 24430 3147 24430 24430 5953 19271 24430 16095 5159 1008 1008 1008 1008 1008 0 30 0 0 0 0 0 0 0 0 0 24430
df.MB
0       199103.0
1       199103.0
2       199103.0
3       199103.0
4       199103.0
          ...   
1805    201704.0
1806    201704.0
1807    201704.0
1808    201704.0
1809    201704.0
Name: MB, Length: 349862, dtype: float64

From column key, we see the first four digits of MB are the year. The last two digits are the sepcific market basket. Let’s extract this information

df['Year'] = df.MB.astype(str).str[:4].astype(int)
df['Market_Basket'] = df.MB.astype(str).str[-4:]
df.shape
(349862, 35)
28200 + 321662
349862

After working with the features let’s see what is still missing in the data

df.isnull().sum()
MB                                   0
Food.No                          28200
Food.Name                            0
Anal.Type                            0
Sample.Qualifier                238134
Replicate.No                    280576
Element                              0
Unit                                 0
Trace                           294997
LOD                               1024
LOQ                               1024
Result.Qualifier.and.Remarks    251813
Method                           73674
Instrument                           2
Batch.ID                        113099
METHOD                          276190
ReplicateNo                     336694
Reference.Material              336694
QC.Level                        336694
QC.Unit                         336694
QC%.Recvd                       336694
City.ID                         349860
Replicate.No                    349562
FiscalYear                      349862
CalendarYear                    349862
Collection                      349862
TDSFoodNumber                   349862
TDSFoodDescription              349862
Season                          349862
TDSFoodListType                 349862
Region                          349862
ReportingLimit                  349862
Conc_mgkg                            0
Year                                 0
Market_Basket                        0
dtype: int64
df[['Food.No', 'Food.Name',
       'Replicate.No', 'Element', 'Conc', 'Trace', 'LOD', 'LOQ',
       'METHOD', 'Food.No', 'ReplicateNo', 'Replicate.No', 'FiscalYear',
       'CalendarYear', 'Collection', 'TDSFoodNumber', 'TDSFoodDescription',
       'Season', 'TDSFoodListType', 'Region', 'ReportingLimit', 'Conc_mgkg',
       'Year', 'Market_Basket']]
Food.No Food.No Food.Name Replicate.No Replicate.No Element Conc Trace LOD LOQ METHOD Food.No Food.No ReplicateNo Replicate.No Replicate.No FiscalYear CalendarYear Collection TDSFoodNumber TDSFoodDescription Season TDSFoodListType Region ReportingLimit Conc_mgkg Year Market_Basket
0 1.0 NaN WHOLE MILK, FLUID NaN NaN ARSENIC 0.00 NaN 0.050 0.140 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.00 1991 03.0
1 1.0 NaN WHOLE MILK, FLUID NaN NaN CADMIUM 0.00 NaN 0.002 0.005 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.00 1991 03.0
2 1.0 NaN WHOLE MILK, FLUID NaN NaN CALCIUM 261.00 NaN 3.000 10.000 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 261.00 1991 03.0
3 1.0 NaN WHOLE MILK, FLUID NaN NaN COPPER 0.00 NaN 0.250 0.900 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.00 1991 03.0
4 1.0 NaN WHOLE MILK, FLUID NaN NaN IRON 0.00 NaN 0.700 3.000 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.00 1991 03.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1805 NaN 400.0 TORTILLA, CORN NaN NaN ZINC 10.10 NaN 0.198 1.780 NaN NaN 400.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 10.10 2017 04.0
1806 NaN 401.0 BREAD, WHITE ROLL/BUN (HAMBURGER/HOTDOG) NaN NaN ZINC 7.33 NaN 0.198 1.780 NaN NaN 401.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.33 2017 04.0
1807 NaN 402.0 RICE, BROWN, COOKED NaN NaN ZINC 11.60 NaN 0.198 1.780 NaN NaN 402.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 11.60 2017 04.0
1808 NaN 403.0 WALNUTS, SHELLED NaN NaN ZINC 28.30 NaN 0.198 1.780 NaN NaN 403.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 28.30 2017 04.0
1809 NaN 404.0 PIZZA, CHEESE, FAST-FOOD NaN NaN ZINC 14.50 NaN 0.198 1.780 NaN NaN 404.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 14.50 2017 04.0

349862 rows × 28 columns

df.columns
Index(['MB', 'Food.No', 'Food.Name', 'Anal.Type', 'Sample.Qualifier',
       'Replicate.No', 'Element', 'Conc', 'Unit', 'Trace', 'LOD', 'LOQ',
       'Result.Qualifier.and.Remarks', 'Method', 'Instrument', 'Batch.ID',
       'METHOD', 'Food.No', 'ReplicateNo', 'Reference.Material', 'QC.Level',
       'QC.Unit', 'QC%.Recvd', 'City.ID', 'Replicate.No', 'FiscalYear',
       'CalendarYear', 'Collection', 'TDSFoodNumber', 'TDSFoodDescription',
       'Season', 'TDSFoodListType', 'Region', 'ReportingLimit', 'Conc_mgkg',
       'Year', 'Market_Basket'],
      dtype='object')
ax = sns.boxplot(df[df.Conc > df.LOQ], x='Element', y = 'Conc')
ax.set_yscale("log")
ax.tick_params(axis='x', rotation=90)

fig = px.box(df[df.Conc > df.LOQ], x='Element', y = 'Conc', log_y=True)
fig.show()